home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Celestin Apprentice 5
/
Apprentice-Release5.iso
/
Source Code
/
Libraries
/
DCLAP 6d
/
dclap6d
/
DBio
/
DSeqList.cpp
< prev
next >
Wrap
Text File
|
1996-07-05
|
14KB
|
621 lines
// DSeqList.cp
// d.g.gilbert
#include "DSeqList.h"
#include "DSequence.h"
#include "DSeqFile.h"
#include <ncbi.h>
#include <dgg.h>
#include <DList.h>
#include <DFile.h>
#include <ureadseq.h>
Global short gLinesPerSeqWritten = 0;
short DSeqList::gMinCommonPercent = 70;
short DSeqList::gMinORFsize = 20;
// DSeqList -------------------------------------
DSeqList::DSeqList() :
DList(),
fSortOrder(kSortByItem)
{
}
DSeqList::~DSeqList()
{
// PLUG MEMORY LEAK !! ~DList doesn't call object destructors !!!!
if (fDeleteObjects) {
long i, n= GetSize();
for (i=0; i<n; i++) {
DSequence* aseq= this->SeqAt(i);
delete aseq;
}
fDeleteObjects= false;
}
}
Boolean DSeqList::IsEmpty()
{
if (GetSize() > 1)
return FALSE;
else if (GetSize() == 1) {
DSequence* aSeq= (DSequence*) this->First();
return (aSeq && aSeq->LengthF() > 0);
}
else
return TRUE;
}
class CFileIndex
{
public:
long* fIndices;
long fMax, fNum;
CFileIndex();
~CFileIndex();
long* Indices() { return fIndices; }
long IndexCount() { return fNum; }
void Indexit( long index);
void Indexit( DFile* aFile);
void IndexEOF( DFile* aFile);
};
CFileIndex::CFileIndex()
{
fNum= 0;
fMax= 0;
fIndices= (long*) MemGet(sizeof(long),true);
}
CFileIndex::~CFileIndex()
{
if (fIndices) MemFree( fIndices);
}
void CFileIndex::Indexit( long index)
{
if (fNum >= fMax){
fMax += 20;
fIndices= (long*) MemMore( fIndices, sizeof(long)*fMax);
}
if (fIndices) fIndices[fNum++]= index;
}
void CFileIndex::Indexit( DFile* aFile)
{
Indexit(aFile->Tell());
}
void CFileIndex::IndexEOF( DFile* aFile)
{
Indexit(aFile->Tell());
fNum--;
}
void DSeqList::DoWrite( DFile* aFile, short format) // change to iostreams !!
{
Boolean needSameSize = false, sizesDiffer = false, isInterleaved = false;
short iseq, nseqs;
short seqtype = DSequence::kOtherSeq;
long minbases = -1;
long start, nbases;
DSequence * aSeq;
DFile * outfile;
DFile * tmpfile = NULL;
CFileIndex aFileIndex;
nseqs= this->GetSize();
outfile= aFile;
if (nseqs>1) // deal w/ one-per-file formats
switch (format) {
case DSeqFile::kGCG : format= DSeqFile::kMSF; break;
case DSeqFile::kStrider : format= DSeqFile::kIG; break;
case DSeqFile::kNoformat:
case DSeqFile::kPlain:
case DSeqFile::kUnknown : format= DSeqFile::kGenBank; break;
default : break;
}
//this->Each( findMinSize);
for (iseq= 0; iseq<nseqs; iseq++) {
aSeq= this->SeqAt(iseq);
aSeq->GetSelection( start, nbases);
seqtype= aSeq->Kind();
//if (minbases<0) minbases= nbases;
if (minbases<=0) minbases= nbases; // ?? temp fix problem w/ empty seq
else {
if (nbases!=minbases) sizesDiffer= true;
minbases= Min( nbases, minbases);
}
}
if (!gOutputName) gOutputName= (char*)aFile->GetShortname();
DSeqFile::WriteSeqHeader( aFile, format, nseqs, minbases, gOutputName, needSameSize, isInterleaved);
gOutputName= NULL;
if (isInterleaved) {
tmpfile= new DTempFile();
outfile= tmpfile;
}
if ((needSameSize || isInterleaved) && sizesDiffer) {
//this->Each( SetSameSize);
for (iseq= 0; iseq<nseqs; iseq++) {
aSeq= this->SeqAt(iseq);
aSeq->GetSelection( start, nbases);
aSeq->SetSelection( start, minbases);
}
}
if (gPretty.isactive && gPretty.numtop) {
aSeq= this->SeqAt(nseqs-1);
gPretty.numline = 1;
if (isInterleaved) aFileIndex.Indexit(outfile);
aSeq->DoWriteSelection( outfile, format);
gPretty.numline = 2;
if (isInterleaved) aFileIndex.Indexit(outfile);
aSeq->DoWriteSelection( outfile, format);
gPretty.numline = 0;
}
for (iseq= 0; iseq<nseqs; iseq++) {
aSeq= this->SeqAt(iseq);
if (isInterleaved) aFileIndex.Indexit(outfile);
aSeq->DoWriteSelection( outfile, format);
}
if (gPretty.isactive && gPretty.numbot) {
aSeq= this->SeqAt(nseqs-1);
gPretty.numline = 2;
if (isInterleaved) aFileIndex.Indexit(outfile);
aSeq->DoWriteSelection( outfile, format);
gPretty.numline = 1;
if (isInterleaved) aFileIndex.Indexit(outfile);
aSeq->DoWriteSelection( outfile, format);
gPretty.numline = 0;
}
if (isInterleaved) {
aFileIndex.IndexEOF( outfile);
//tmpfile->Close(); // don't need
//tmpfile->Open("r"); // don't need
DSeqFile::DoInterleave( aFile, tmpfile, format, seqtype, nseqs,
minbases, gLinesPerSeqWritten,
aFileIndex.IndexCount(), aFileIndex.Indices());
outfile= aFile;
//tmpfile->Close(); // don't need
tmpfile->Delete(); // debug tmp
}
DSeqFile::WriteSeqTrailer( aFile, format, this->GetSize(), gLinesPerSeqWritten, minbases);
}
void DSeqList::DoWrite( char* aFileName, short format)
{
gOutputName= aFileName;
DFile* aFile= new DFile( aFileName, "w"); //, "TEXT", fgSire);
aFile->Open();
DoWrite( aFile, format);
aFile->Close();
}
void DSeqList::AddNewSeq()
{
DSequence* aSeq = new DSequence();
InsertLast( aSeq);
//aSeq->fIndex= this->GetSize();
}
void DSeqList::ClearSelections()
{
long i, nseq= GetSize();
for (i= 0; i<nseq; i++)
SeqAt(i)->ClearSelection();
}
short DSeqList::ZeroOrigin()
{
DSequence* aSeq;
short shiftall= 0;
long i, nseq= GetSize();
for (i= 0; i<nseq; i++) {
aSeq= SeqAt(i);
if (aSeq->Origin()<0) shiftall= Min(shiftall, aSeq->Origin());
}
for (i= 0; i<nseq; i++) {
aSeq= SeqAt(i);
aSeq->SetOrigin(aSeq->Origin() - shiftall); // -= shiftall;
if (aSeq->Origin() > 0) {
aSeq->InsertSpacers( 0, aSeq->Origin(), DSequence::indelEdge);
aSeq->SetOrigin(0);
}
}
return shiftall;
}
short dslCompareByName( void* item1, void* item2)
{
char* name1= (char*) ((DSequence*)item1)->Name();
char* name2= (char*) ((DSequence*)item2)->Name();
return StrICmp( name1, name2);
}
short dslCompareBySize( void* item1, void* item2)
{
// inverted sort, largest 1st
long name1= ((DSequence*)item1)->LengthF();
long name2= ((DSequence*)item2)->LengthF();
if (name1 > name2) return -1;
else if (name1 < name2) return 1;
else return 0;
}
short dslCompareByKind( void* item1, void* item2)
{
short name1= ((DSequence*)item1)->Kind();
short name2= ((DSequence*)item2)->Kind();
if (name1 > name2) return 1;
else if (name1 < name2) return -1;
else return 0;
}
short dslCompareByDate( void* item1, void* item2)
{
// inverted sort, latest 1st
unsigned long name1= ((DSequence*)item1)->ModTime();
unsigned long name2= ((DSequence*)item2)->ModTime();
if (name1 > name2) return -1;
else if (name1 < name2) return 1;
else return 0;
}
Boolean DSeqList::SortList(Sorts sortorder)
{
if (sortorder == fSortOrder)
return false;
else {
fSortOrder= sortorder;
switch( sortorder) {
case kSortByKind :
this->SortBy( dslCompareByKind);
break;
case kSortBySize :
this->SortBy( dslCompareBySize);
break;
case kSortByDate :
this->SortBy( dslCompareByDate);
break;
case kSortByName :
this->SortBy( dslCompareByName);
break;
case kSortByItem:
default:
this->Sort();
break;
}
return true;
}
}
DSequence* DSeqList::FindNamedSeq(char* name, Boolean respectCase)
{
if (name && *name) {
DSequence* aSeq;
long i, nseq= GetSize();
if (respectCase) for (i= 0; i<nseq; i++) {
aSeq= SeqAt(i);
if (Nlm_StringCmp(aSeq->Name(), name)==0) return aSeq;
}
else for (i= 0; i<nseq; i++) {
aSeq= SeqAt(i);
if (Nlm_StringICmp(aSeq->Name(), name)==0) return aSeq;
}
}
return NULL;
}
DSequence* DSeqList::Consensus(void)
{
long i, nseq= GetSize();
for (i= 0; i<nseq; i++) {
DSequence* aSeq= SeqAt(i);
if (aSeq->IsConsensus()) return aSeq;
}
return NULL;
}
short DSeqList::ConsensusRow()
{
DSequence* cons= this->Consensus();
if (cons)
return this->GetIdentityItemNo( cons);
else
return -1;
}
void DSeqList::MakeConsensus()
{
DSequence* cons= this->Consensus();
if (!cons) {
cons = new DSequence();
cons->SetName(DSequence::kConsensus);
InsertLast( cons);
}
if (cons) {
//short arow= this->GetIdentityItemNo( cons);
char* hCon= cons->Bases();
long conlen= cons->LengthF(); //StrLen( hCon);
long count= 0;
long i, iseq, nseq= GetSize();
for (iseq= 0; iseq<nseq; iseq++) {
DSequence* aSeq= this->SeqAt(iseq);
if (!aSeq->IsConsensus()
//&& aSeq->Kind() != DSequence::kOtherSeq
&& (aSeq->Kind() == DSequence::kDNA
|| aSeq->Kind() == DSequence::kRNA
|| aSeq->Kind() == DSequence::kNucleic)
) {
count++;
char* hSeq= aSeq->Bases();
long len = aSeq->LengthF();
if (len > conlen) {
conlen= len;
hCon= (char*) MemMore( hCon, conlen);
cons->UpdateBases( hCon, conlen);
}
if (count==1) {
for (i= 0; i<len; i++) hCon[i]= hSeq[i];
}
else {
for (i= 0; i<len; i++)
// !! THis is only good for Nucleic bases, not for Aminos !!
// change to table of matches !!
hCon[i]= DSequence::NucleicConsensus(
DSequence::NucleicBits(hCon[i]),
DSequence::NucleicBits(hSeq[i]));
}
}
}
}
}
char* DSeqList::Distances( short correct, Boolean doSimilarity)
{
struct NucCounts {
float apct, cpct, gpct, tpct;
};
long iseq, nseq= GetSize();
NucCounts* nuccounts = NULL;
char linebuf[256], * buf = NULL;
char name0[128];
if (!nseq) return NULL;
if (correct == DColsen) nuccounts= new NucCounts[ nseq];
buf= (char*) MemNew(1);
*buf= 0;
*name0= 0;
if (correct == DColsen) {
for (iseq=0; iseq<nseq; iseq++) {
long na, nc, ng, nt, ntotal;
DSequence* aSeq= this->SeqAt(iseq);
char* bases= aSeq->Bases();
long ibase, nbases= aSeq->LengthF();
na= nc= ng= nt= ntotal=0;
// create Olsen correction array
for (ibase=0; ibase<nbases; ibase++) {
long baseval= DSequence::NucleicBits(bases[ibase]);
if (baseval & DSequence::kMaskA) na++;
if (baseval & DSequence::kMaskC) nc++;
if (baseval & DSequence::kMaskG) ng++;
if (baseval & DSequence::kMaskT) nt++;
if (baseval) ntotal++;
}
nuccounts[iseq].apct = (float) na / (float) ntotal;
nuccounts[iseq].cpct = (float) nc / (float) ntotal;
nuccounts[iseq].gpct = (float) ng / (float) ntotal;
nuccounts[iseq].tpct = (float) nt / (float) ntotal;
}
}
for (long j=1; j<nseq; j++)
for (long k=0; k<j; k++) {
long jk;
long numer= 0, denom= 0;
DSequence* jseq= this->SeqAt( j);
DSequence* kseq= this->SeqAt( k);
long len= MIN( jseq->LengthF(), kseq->LengthF());
for (jk= 0; jk < len; jk++) {
long jval= jseq->NucleicBits(jseq->Bases()[jk]);
long kval= kseq->NucleicBits(kseq->Bases()[jk]);
if ( jval & kval) numer++;
if ( jval | kval) denom++;
}
float distv, cor;
switch (correct) {
case DColsen:
cor = nuccounts[j].apct * nuccounts[k].apct
+ nuccounts[j].cpct * nuccounts[k].cpct
+ nuccounts[j].gpct * nuccounts[k].gpct
+ nuccounts[j].tpct * nuccounts[k].tpct;
break;
case DCjukes: cor = 0.25; break;
case DCnone:
default: cor = 0.0; break;
}
if (correct == DCnone)
distv= 1.0 - (float)numer / (float)denom;
else
distv= -(1.0-cor) * log( 1.0 / (1.0 - cor)
* ((float)numer / (float)denom - cor) );
if (k == 0) {
if (j == 1) {
sprintf( name0, "%2d=%-15s",k+1,kseq->Name());
//StrExtendCat( &buf, linebuf);
}
sprintf( linebuf, "%2d:%-15s|",j+1,jseq->Name());
StrExtendCat( &buf, linebuf);
}
if (doSimilarity)
sprintf(linebuf, "%6.1f",100 - distv*100.0);
else
sprintf(linebuf, "%6.1f",distv*100.0);
StrExtendCat( &buf, linebuf);
if (k == j-1) {
StrExtendCat( &buf, " \n");
}
}
sprintf( linebuf, " %-15s ","-------------");
StrExtendCat( &buf, linebuf);
for (iseq=1; iseq<nseq; iseq++) {
sprintf(linebuf, " %5s", "----");
StrExtendCat( &buf, linebuf);
}
sprintf( linebuf, "\n%-18s ",name0);
StrExtendCat( &buf, linebuf);
for (iseq=1; iseq<nseq; iseq++) {
sprintf(linebuf, "%5d ",iseq);
StrExtendCat( &buf, linebuf);
}
char * cors, *skind;
switch (correct) {
case DCjukes: cors= "(Jukes correction)"; break;
case DColsen: cors= "(Olsen correction)"; break;
default:
case DCnone: cors= ""; break;
}
if (doSimilarity) skind= "Similarity"; else skind="Distance";
sprintf( linebuf, "\n %s matrix %s\n", skind, cors);
StrExtendCat( &buf, linebuf);
delete[] nuccounts;
return buf;
}
char* DSeqList::FindCommonBases(short minCommonPerCent, char*& firstCommon)
{
enum { kLetrange = 26, kLetlast = 255 };
char * hSeq, * hMaxbase;
char ch;
long len, maxlen, ibase;
short spccount, nilcount;
short maxlet, let, nseq, maxcnt;
short letcount[kLetrange], letfirst[kLetrange];
float rmaxc;
DSequence* cons= this->Consensus();
if (!cons) cons= (DSequence*) this->First(); //any seq will do...
hMaxbase= NULL;
firstCommon= NULL;
if (cons) {
maxlen= cons->LengthF();
hMaxbase= (char*) Nlm_MemGet(maxlen+1,true);
firstCommon= (char*) Nlm_MemGet(maxlen+1,true);
for (ibase= 0; ibase<maxlen; ibase++) {
for (let= 0; let < kLetrange; let++) { letcount[let]= 0; letfirst[let]= kLetlast; }
nilcount= 0;
spccount= 0;
nseq= 0;
long iseq, maxseq= GetSize();
for (iseq=0; iseq<maxseq; iseq++) {
DSequence* aSeq= this->SeqAt(iseq);
if (!aSeq->IsConsensus() && aSeq->Kind() != DSequence::kOtherSeq) {
hSeq= aSeq->Bases();
len= aSeq->LengthF();
if (ibase<len) ch= toupper( hSeq[ibase]);
else ch= 0;
if (ch >= 'A' && ch <= 'Z') {
let = ch - 'A';
letcount[let]++;
if (letfirst[let] == kLetlast) letfirst[let]= Min(kLetlast, iseq);
}
else if (
ch == DSequence::indelSoft
|| ch == DSequence::indelHard
|| ch == DSequence::indelEdge
|| ch == ' ' || ch == '_' )
spccount++;
else
nilcount++;
nseq++;
}
}
for (let= 0, maxlet= 0, maxcnt= -1; let < kLetrange; let++)
if (letcount[let] > maxcnt) {
maxcnt= letcount[let];
maxlet= let;
}
if (maxcnt>0) {
firstCommon[ibase]= letfirst[maxlet];
rmaxc= (100.0 * maxcnt) / nseq;
if (rmaxc >= minCommonPerCent) //mark maxlet's
hMaxbase[ibase]= maxlet + 'A';
else
hMaxbase[ibase]= '!';
}
else {
firstCommon[ibase]= kLetlast;
hMaxbase[ibase]= '!';
}
}
hMaxbase[maxlen]= 0;
firstCommon[maxlen]= 0;
}
return hMaxbase;
}